Setup

Necessary Libraries

library(MicrobeR)
library(dada2)
library(vegan)
library(ape)
library(philr)
library(lmerTest)
library(tidyverse)
library(readxl)
library(phyloseq)
library(ggtree)
library(qiime2R)
library(ALDEx2)
library(gghighlight)
library(patchwork)
library(ggpubr)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.4.0      patchwork_1.1.1   gghighlight_0.3.0 ALDEx2_1.18.0    
##  [5] qiime2R_0.99.34   ggtree_2.0.4      phyloseq_1.30.0   readxl_1.3.1     
##  [9] forcats_0.5.0     stringr_1.4.0     dplyr_0.8.5       purrr_0.3.4      
## [13] readr_1.3.1       tidyr_1.0.2       tibble_3.0.1      ggplot2_3.3.0    
## [17] tidyverse_1.3.0   lmerTest_3.1-2    lme4_1.1-23       Matrix_1.2-18    
## [21] philr_1.12.0      ape_5.3           vegan_2.5-6       lattice_0.20-38  
## [25] permute_0.9-5     dada2_1.14.1      Rcpp_1.0.4        MicrobeR_0.3.2   
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.6             Hmisc_4.4-0                
##   [3] fastmatch_1.1-0             plyr_1.8.6                 
##   [5] igraph_1.2.5                lazyeval_0.2.2             
##   [7] splines_3.6.3               BiocParallel_1.20.1        
##   [9] GenomeInfoDb_1.22.1         rtk_0.2.5.8                
##  [11] digest_0.6.25               foreach_1.5.0              
##  [13] htmltools_0.5.0             fansi_0.4.1                
##  [15] checkmate_2.0.0             magrittr_1.5               
##  [17] memoise_1.1.0               cluster_2.1.0              
##  [19] DECIPHER_2.14.0             openxlsx_4.1.5             
##  [21] Biostrings_2.54.0           modelr_0.1.6               
##  [23] RcppParallel_5.0.0          matrixStats_0.56.0         
##  [25] jpeg_0.1-8.1                colorspace_1.4-1           
##  [27] blob_1.2.1                  rvest_0.3.5                
##  [29] haven_2.2.0                 xfun_0.13                  
##  [31] crayon_1.3.4                RCurl_1.98-1.2             
##  [33] jsonlite_1.6.1              survival_3.1-8             
##  [35] phangorn_2.5.5              iterators_1.0.12           
##  [37] glue_1.4.0                  gtable_0.3.0               
##  [39] zlibbioc_1.32.0             XVector_0.26.0             
##  [41] DelayedArray_0.12.3         car_3.0-8                  
##  [43] Rhdf5lib_1.8.0              BiocGenerics_0.32.0        
##  [45] abind_1.4-5                 scales_1.1.0               
##  [47] DBI_1.1.0                   rstatix_0.6.0              
##  [49] htmlTable_1.13.3            viridisLite_0.3.0          
##  [51] tidytree_0.3.3              foreign_0.8-75             
##  [53] bit_1.1-15.2                Formula_1.2-3              
##  [55] stats4_3.6.3                DT_0.13                    
##  [57] truncnorm_1.0-8             htmlwidgets_1.5.1          
##  [59] httr_1.4.1                  RColorBrewer_1.1-2         
##  [61] acepack_1.4.1               ellipsis_0.3.0             
##  [63] pkgconfig_2.0.3             NADA_1.6-1.1               
##  [65] nnet_7.3-12                 dbplyr_1.4.3               
##  [67] tidyselect_1.0.0            rlang_0.4.5                
##  [69] reshape2_1.4.4              munsell_0.5.0              
##  [71] cellranger_1.1.0            tools_3.6.3                
##  [73] cli_2.0.2                   generics_0.0.2             
##  [75] RSQLite_2.2.0               ade4_1.7-15                
##  [77] broom_0.5.6                 evaluate_0.14              
##  [79] biomformat_1.14.0           yaml_2.2.1                 
##  [81] knitr_1.28                  bit64_0.9-7                
##  [83] fs_1.4.1                    zip_2.0.4                  
##  [85] nlme_3.1-144                xml2_1.3.2                 
##  [87] rstudioapi_0.11             compiler_3.6.3             
##  [89] curl_4.3                    plotly_4.9.2.1             
##  [91] png_0.1-7                   ggsignif_0.6.0             
##  [93] zCompositions_1.3.4         reprex_0.3.0               
##  [95] treeio_1.10.0               statmod_1.4.34             
##  [97] stringi_1.4.6               nloptr_1.2.2.1             
##  [99] multtest_2.42.0             vctrs_0.2.4                
## [101] pillar_1.4.3                lifecycle_0.2.0            
## [103] BiocManager_1.30.10         data.table_1.12.8          
## [105] bitops_1.0-6                GenomicRanges_1.38.0       
## [107] R6_2.4.1                    latticeExtra_0.6-29        
## [109] hwriter_1.3.2               ShortRead_1.44.3           
## [111] rio_0.5.16                  gridExtra_2.3              
## [113] IRanges_2.20.2              codetools_0.2-16           
## [115] boot_1.3-24                 MASS_7.3-51.5              
## [117] assertthat_0.2.1            picante_1.8.1              
## [119] rhdf5_2.30.1                SummarizedExperiment_1.16.1
## [121] withr_2.2.0                 GenomicAlignments_1.22.1   
## [123] Rsamtools_2.2.3             S4Vectors_0.24.4           
## [125] GenomeInfoDbData_1.2.2      mgcv_1.8-31                
## [127] parallel_3.6.3              hms_0.5.3                  
## [129] rpart_4.1-15                quadprog_1.5-8             
## [131] grid_3.6.3                  minqa_1.2.4                
## [133] rmarkdown_2.1               rvcheck_0.1.8              
## [135] carData_3.0-4               base64enc_0.1-3            
## [137] numDeriv_2016.8-1.1         Biobase_2.46.0             
## [139] lubridate_1.7.8

Theme

Whitecolor="#E69F00"
Chinesecolor="#0072B2"

# theme for pcoas
theme_pcoa<- function () { 
  theme_classic(base_size=10, base_family="Helvetica") +
  theme(axis.text = element_text(size=8, color = "black"), 
        axis.title = element_text(size=10, color="black"), 
        legend.text = element_text(size=8, color = "black"), 
        legend.title = element_text(size=10, color = "black"), 
        plot.title = element_text(size=10, color="black")) +
  theme(panel.border = element_rect(color="black", size=1, fill=NA))
}

# theme for boxplots
theme_boxplot<- function () { 
  theme_classic(base_size=10, base_family="Helvetica") +
  theme(axis.text.x = element_text(size=10, color = "black"),
        axis.text.y = element_text(size=8, color="black"),
        axis.title.x= element_blank(),
        axis.title.y = element_text(size=10, color="black"), 
        legend.position = "none")
}

Data Import

metadata<-read_excel("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/metadata_IDEO_cohort.xlsx") %>% column_to_rownames("SampleID")

SVtab<-read_qza("/Volumes/turnbaughlab/qb3share/qiyanang/16S_IDEO_FonlyNewPipeline/Output/ASV_table.qza")$data %>% as.data.frame() 

SVseq<-read_qza("/Volumes/turnbaughlab/qb3share/qiyanang/16S_IDEO_FonlyNewPipeline/Output/ASV_sequences.qza")$data %>% as.data.frame() %>% rename("SV"=x)

taxonomy<-read.delim("/Volumes/turnbaughlab/qb3share/qiyanang/16S_IDEO_FonlyNewPipeline/Output/ASV_d2taxonomy.txt", header=T)

lookup<-(SVseq %>% rownames_to_column("ASV")) %>%
  left_join(taxonomy, by="ASV") %>%
  column_to_rownames("ASV")
## Warning: Column `ASV` joining character vector and factor, coercing into
## character vector
# Tree
tree<-read_qza("/Volumes/turnbaughlab/qb3share/qiyanang/16S_IDEO_FonlyNewPipeline/Output/ASV_denovotree.qza")$data

Assign SV numbers

Assignment rates across levels

lookup %>%
  select(-SV) %>%
  apply(., MARGIN=2, function(x) sum(!is.na(x))/nrow(taxonomy)*100)
##           Kingdom            Phylum             Class             Order 
##        100.000000         99.889105         99.500970         97.588023 
##            Family             Genus           Species Species_ambiguous 
##         91.627391         71.971167          8.566676         13.473801 
##          Sequence 
##        100.000000

Filter SV table

# Correct typos in SVtab colnames
names(SVtab) <- gsub(x = names(SVtab), pattern = "0B0", replacement = "OB0")  

# Subset SVtab to contain only White and Chinese samples
SVtab<-SVtab[,rownames(metadata)]
histogram(colSums(SVtab))

print("Read count for samples used in downstream analysis range 72091-319590")
## [1] "Read count for samples used in downstream analysis range 72091-319590"

Visualize tree

gtree<-ggtree(tree)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
gtree<-gtree %<+% ((tibble(Tip=tree$tip.label)) %>% left_join(lookup %>% rownames_to_column("Tip")))
## Joining, by = "Tip"
gtree +
  geom_tippoint(aes(color=Phylum)) 

Quality Filter

SVtab<-Confidence.Filter(SVtab, MINSAMPS = 2, MINREADS=10, VERBOSE=TRUE)
lookup<-lookup[rownames(SVtab),]
tree<-drop.tip(tree, tree$tip.label[!tree$tip.label %in% rownames(lookup)])

Beta Diversity: PhILR Euclidean

PCoA plot

#check sample order is the same in SVtab and metadata
colnames(SVtab)
##  [1] "OB006"    "OB008"    "OB011"    "OB014"    "OB018"    "OB020"   
##  [7] "OB028.V1" "OB036"    "OB039"    "OB042"    "OB045"    "OB046"   
## [13] "OB050.V1" "OB056.V1" "OB059"    "OB063"    "OB064"    "OB066"   
## [19] "OB067"    "OB068"    "OB069"    "OB071"    "OB081"    "OB082"   
## [25] "OB012"    "OB021"    "OB022"    "OB024"    "OB026"    "OB027"   
## [31] "OB030"    "OB035"    "OB040"    "OB041"    "OB043"    "OB044"   
## [37] "OB051"    "OB055"    "OB057"    "OB070"    "OB072"    "OB078"   
## [43] "OB079"    "OB084"    "OB085"    "OB088"
rownames(metadata)
##  [1] "OB006"    "OB008"    "OB011"    "OB014"    "OB018"    "OB020"   
##  [7] "OB028.V1" "OB036"    "OB039"    "OB042"    "OB045"    "OB046"   
## [13] "OB050.V1" "OB056.V1" "OB059"    "OB063"    "OB064"    "OB066"   
## [19] "OB067"    "OB068"    "OB069"    "OB071"    "OB081"    "OB082"   
## [25] "OB012"    "OB021"    "OB022"    "OB024"    "OB026"    "OB027"   
## [31] "OB030"    "OB035"    "OB040"    "OB041"    "OB043"    "OB044"   
## [37] "OB051"    "OB055"    "OB057"    "OB070"    "OB072"    "OB078"   
## [43] "OB079"    "OB084"    "OB085"    "OB088"
#PhILR
PHILR<-philr(
            t(SVtab+1), 
            tree, 
            part.weights='enorm.x.gm.counts', 
            ilr.weights='blw.sqrt'
            )
## Building Sequential Binary Partition from Tree...
## Building Contrast Matrix...
## Transforming the Data...
## Calculating ILR Weights...
## Warning in calculate.blw(tree, method = "sum.children"): Note: a total of 105
## tip edges with zero length have been replaced with a small pseudocount of the
## minimum non-zero edge length ( 5e-09 ).
#Adonis
adonis<-vegan::adonis(dist(PHILR, method="euclidean") ~ Ethnicity, data=metadata, permutations = 10000)
adonis$aov.tab
## Permutation: free
## Number of permutations: 10000
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model     R2   Pr(>F)   
## Ethnicity  1      2659  2659.2  2.0975 0.0455 0.005799 **
## Residuals 44     55783  1267.8         0.9545            
## Total     45     58443                 1.0000            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Calculate PCo axis values
euclid<-ape::pcoa(dist(PHILR, method="euclidean"))
var.PCo1 <- format(100*(euclid$values$Eigenvalues/sum(euclid$values$Eigenvalues))[1], digits=2, nsmall=1)
var.PCo2 <- format(100*(euclid$values$Eigenvalues/sum(euclid$values$Eigenvalues))[2], digits=2, nsmall=1)
var.PCo3 <- format(100*(euclid$values$Eigenvalues/sum(euclid$values$Eigenvalues))[3], digits=2, nsmall=1)

#Scree plot
euclid$values %>%
  as.data.frame() %>%
  rownames_to_column("PC") %>%
  mutate(PC=as.numeric(PC)) %>%
  select(PC, Relative_eig) %>%
  mutate(PercentVariation=Relative_eig*100) %>%
  ggplot(aes(x=PC, y=PercentVariation)) +
  geom_point()

#Plot pcoa
plot_1a <- euclid$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata %>% rownames_to_column("SampleID")) %>%
  ggplot(aes(x=Axis.1, y=Axis.2, fill=Ethnicity)) +
  geom_point(size=2, shape=21) +
  theme_pcoa() +
  ylab(paste0("PCo2 [",var.PCo2,"%]")) +
  xlab(paste0("PCo1 [",var.PCo1,"%]")) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  ggtitle(paste("p = ",round(adonis$aov.tab$`Pr(>F)`[1],3),", R2 = ", round(adonis$aov.tab$R2[1],3)))
## Joining, by = "SampleID"
plot_1a

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/figures_16S_human/pcoa_philr.pdf", height=3, width=3.7, useDingbats=F)
ggsave("pcoa_philr.pdf", height=2.5, width=3.2, useDingbats=F)

# source data
sourcedat1a <- 
  euclid$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata %>% rownames_to_column("SampleID")) %>% 
  select(SubjectID, Ethnicity, Axis.1, Axis.2)
## Joining, by = "SampleID"
#write_tsv(sourcedat1a, "/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/sourcedata/Fig1A.tsv")
write_tsv(sourcedat1a, "Fig1A.tsv")

Compare adonis effect sizes across metadata variables

metadata_adonis<-read_excel("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/metadata_for_adonis.xlsx") %>% column_to_rownames("SampleID")
adonis_classes<-read_excel("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/metadata_for_adonis.xlsx",sheet=2)

metadata_adonis2 <- mutate_if(metadata_adonis, is.character, as.factor)
rownames(metadata_adonis2) <- rownames(metadata_adonis)

philr_dm <- dist(PHILR, method="euclidean")

# loop adonis over variables
df_total = data.frame()
for (i in colnames(metadata_adonis2)) {
  form <- as.formula(paste("philr_dm", i, sep="~"))
  adonis_res<- vegan::adonis(form, data=metadata_adonis2, permutations=10000)
  df <- data.frame(Metric=i, Pval=adonis_res$aov.tab$`Pr(>F)`[1], R2=adonis_res$aov.tab$R2[1])
  df_total <- rbind(df_total, df)
}
df_total
##           Metric       Pval         R2
## 1            Age 0.01739826 0.04009490
## 2            Sex 0.11378862 0.03059461
## 3      Ethnicity 0.00609939 0.04550104
## 4        US_Born 0.64673533 0.01889155
## 5       Diabetes 0.00029997 0.06797536
## 6            BMI 0.04069593 0.03581124
## 7        Insulin 0.03819618 0.03690048
## 8            FPG 0.49185081 0.02091000
## 9        HOMA_IR 0.39146085 0.02273214
## 10   Cholesterol 0.81561844 0.01603931
## 11 Triglycerides 0.29637036 0.02526780
## 12 Energy_intake 0.83771623 0.01453800
## 13       Protein 0.74422558 0.01710164
## 14           Fat 0.33966603 0.02378533
## 15  Carbohydrate 0.24727527 0.02646417
## 16         Fiber 0.16118388 0.02902421
## 17        Statin 0.00779922 0.04531316
## 18     Metformin 0.00079992 0.06051858
## 19       Alcohol 0.82521748 0.01592631
df_total %>%
  arrange(desc(R2)) %>% pull(Metric) -> order

df_total %>%
  left_join(adonis_classes) %>%
  mutate(Metric=factor(Metric, levels=order)) %>%
  ggplot(aes(x=Metric, y=R2, fill=Class)) +
  geom_bar(stat="identity") +
  theme_pcoa()+
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1), legend.position=c(0.8,0.7)) +
  scale_fill_viridis_d() +
  xlab("") + ylab("Effect size (R2)") +
  scale_y_continuous(expand=c(0,0), limits=c(0,0.075))
## Joining, by = "Metric"
## Warning: Column `Metric` joining factor and character vector, coercing into
## character vector

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/figures_16S_human/adonis_effectsizes.pdf", height=3.5, width=4, useDingbats=F)

Save pcoa output for geographic analysis

Beta Diversity: Other metrics (Tables)

DistanceMatrices<-list()
 
DistanceMatrices$WeightedUniFrac<-UniFrac(phyloseq(otu_table(Make.Proportion(Subsample.Table(SVtab)), taxa_are_rows = T), tree), weighted=T)
## Warning in matrix(tree$edge[order(tree$edge[, 1]), ][, 2], byrow = TRUE, : data
## length [2607] is not a sub-multiple or multiple of the number of rows [1304]
dm_res<-vegan::adonis(DistanceMatrices$WeightedUniFrac ~ Ethnicity, data=metadata, permutations = 10000)
dm_res_combined<-data.frame(Metric="Weighted Unifrac", Pval=dm_res$aov.tab$`Pr(>F)`[1], R2=dm_res$aov.tab$R2[1])

DistanceMatrices$UnweightedUniFrac<-UniFrac(phyloseq(otu_table(Make.Proportion(Subsample.Table(SVtab)), taxa_are_rows = T), tree), weighted=F)
## Warning in matrix(tree$edge[order(tree$edge[, 1]), ][, 2], byrow = TRUE, : data
## length [2607] is not a sub-multiple or multiple of the number of rows [1304]
dm_res<-vegan::adonis(DistanceMatrices$UnweightedUniFrac ~ Ethnicity, data=metadata, permutations = 10000)
dm_res_combined<-rbind(dm_res_combined, data.frame(Metric="Unweighted Unifrac", Pval=dm_res$aov.tab$`Pr(>F)`[1], R2=dm_res$aov.tab$R2[1]))

DistanceMatrices$BrayCurtis<-distance(otu_table(Make.Proportion(Subsample.Table(SVtab)), taxa_are_rows=T), method="bray", type="samples")
dm_res<-vegan::adonis(DistanceMatrices$BrayCurtis ~ Ethnicity, data=metadata, permutations = 10000)
dm_res_combined<-rbind(dm_res_combined, data.frame(Metric="Bray Curtis", Pval=dm_res$aov.tab$`Pr(>F)`[1], R2=dm_res$aov.tab$R2[1]))

DistanceMatrices$JSD<-distance(otu_table(Make.Proportion(Subsample.Table(SVtab)), taxa_are_rows=T), method="jsd", type="samples")
dm_res<-vegan::adonis(DistanceMatrices$JSD ~ Ethnicity, data=metadata, permutations = 10000)
dm_res_combined<-rbind(dm_res_combined, data.frame(Metric="Jensen-Shannon Divergence", Pval=dm_res$aov.tab$`Pr(>F)`[1], R2=dm_res$aov.tab$R2[1]))

DistanceMatrices$Jaccard<-distance(otu_table(Make.Proportion(Subsample.Table(SVtab)), taxa_are_rows=T), method="jaccard", type="samples")
dm_res<-vegan::adonis(DistanceMatrices$Jaccard ~ Ethnicity, data=metadata, permutations = 10000)
dm_res_combined<-rbind(dm_res_combined, data.frame(Metric="Jaccard", Pval=dm_res$aov.tab$`Pr(>F)`[1], R2=dm_res$aov.tab$R2[1]))

DistanceMatrices$CLREuclidean<-dist(t(Make.CLR(SVtab)), method="euclidean")
dm_res<-vegan::adonis(DistanceMatrices$CLREuclidean ~ Ethnicity, data=metadata, permutations = 10000)
dm_res_combined<-rbind(dm_res_combined, data.frame(Metric="CLR Euclidean", Pval=dm_res$aov.tab$`Pr(>F)`[1], R2=dm_res$aov.tab$R2[1]))

DistanceMatrices$PhILREuclidean<-dist(PHILR, method="euclidean")
dm_res<-vegan::adonis(DistanceMatrices$PhILREuclidean ~ Ethnicity, data=metadata, permutations = 10000)
dm_res_combined<-rbind(dm_res_combined, data.frame(Metric="PhILR Euclidean", Pval=dm_res$aov.tab$`Pr(>F)`[1], R2=dm_res$aov.tab$R2[1]))

Nice.Table(dm_res_combined)

Beta Diversity: Other metrics (Plots)

BC

#pcoa
pcbraycurtis<-pcoa(DistanceMatrices$BrayCurtis)

#calculate PCo axis values
var.PCo1 <- format(100*(pcbraycurtis$values$Eigenvalues/sum(pcbraycurtis$values$Eigenvalues))[1], digits=2, nsmall=1)
var.PCo2 <- format(100*(pcbraycurtis$values$Eigenvalues/sum(pcbraycurtis$values$Eigenvalues))[2], digits=2, nsmall=1)

#plot pcoa
bc <- pcbraycurtis$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata %>% rownames_to_column("SampleID")) %>%
  ggplot(aes(x=Axis.1, y=Axis.2, fill=Ethnicity)) +
  geom_point(size=2, shape=21) +
  theme_pcoa() +
  theme(legend.position = "none") +
  ylab(paste0("PCo2 [",var.PCo2,"%]")) +
  xlab(paste0("PCo1 [",var.PCo1,"%]")) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  ggtitle("Bray Curtis")

Weighted Unifrac

#pcoa
pcWUnif<-pcoa(DistanceMatrices$WeightedUniFrac)

#calculate PCo axis values
var.PCo1 <- format(100*(pcWUnif$values$Eigenvalues/sum(pcWUnif$values$Eigenvalues))[1], digits=2, nsmall=1)
var.PCo2 <- format(100*(pcWUnif$values$Eigenvalues/sum(pcWUnif$values$Eigenvalues))[2], digits=2, nsmall=1)

#plot pcoa
WUnif <- pcWUnif$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata %>% rownames_to_column("SampleID")) %>%
  ggplot(aes(x=Axis.1, y=Axis.2, fill=Ethnicity)) +
  geom_point(size=2, shape=21) +
  theme_pcoa() +
  theme(legend.position = "none") +
  ylab(paste0("PCo2 [",var.PCo2,"%]")) +
  xlab(paste0("PCo1 [",var.PCo1,"%]")) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  ggtitle("Weighted Unifrac")

Unweighted Unifrac

#pcoa
pcUWUnif<-pcoa(DistanceMatrices$UnweightedUniFrac)

#calculate PCo axis values
var.PCo1 <- format(100*(pcUWUnif$values$Eigenvalues/sum(pcUWUnif$values$Eigenvalues))[1], digits=2, nsmall=1)
var.PCo2 <- format(100*(pcUWUnif$values$Eigenvalues/sum(pcUWUnif$values$Eigenvalues))[2], digits=2, nsmall=1)

#plot pcoa
UWUnif <- pcUWUnif$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata %>% rownames_to_column("SampleID")) %>%
  ggplot(aes(x=Axis.1, y=Axis.2, fill=Ethnicity)) +
  geom_point(size=2, shape=21) +
  theme_pcoa() +
  theme(legend.position = "none") +
  ylab(paste0("PCo2 [",var.PCo2,"%]")) +
  xlab(paste0("PCo1 [",var.PCo1,"%]")) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  ggtitle("Unweighted Unifrac")

Jaccard

#pcoa
pcJac<-pcoa(DistanceMatrices$Jaccard)

#calculate PCo axis values
var.PCo1 <- format(100*(pcJac$values$Eigenvalues/sum(pcJac$values$Eigenvalues))[1], digits=2, nsmall=1)
var.PCo2 <- format(100*(pcJac$values$Eigenvalues/sum(pcJac$values$Eigenvalues))[2], digits=2, nsmall=1)

#plot pcoa
jac <- pcJac$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata %>% rownames_to_column("SampleID")) %>%
  ggplot(aes(x=Axis.1, y=Axis.2, fill=Ethnicity)) +
  geom_point(size=2, shape=21) +
  theme_pcoa() +
  theme(legend.position = "none") +
  ylab(paste0("PCo2 [",var.PCo2,"%]")) +
  xlab(paste0("PCo1 [",var.PCo1,"%]")) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  ggtitle("Jaccard") 

JSD

#pcoa
pcJSD<-pcoa(DistanceMatrices$JSD)

#calculate PCo axis values
var.PCo1 <- format(100*(pcJSD$values$Eigenvalues/sum(pcJSD$values$Eigenvalues))[1], digits=2, nsmall=1)
var.PCo2 <- format(100*(pcJSD$values$Eigenvalues/sum(pcJSD$values$Eigenvalues))[2], digits=2, nsmall=1)

#plot pcoa
jsd <- pcJSD$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata %>% rownames_to_column("SampleID")) %>%
  ggplot(aes(x=Axis.1, y=Axis.2, fill=Ethnicity)) +
  geom_point(size=2, shape=21) +
  theme_pcoa() +
  theme(legend.position = "none") +
  ylab(paste0("PCo2 [",var.PCo2,"%]")) +
  xlab(paste0("PCo1 [",var.PCo1,"%]")) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  ggtitle("Jensen-Shannon Divergence")

CLR Euclidean

#pcoa
pcCLR<-pcoa(DistanceMatrices$CLREuclidean)

#calculate PCo axis values
var.PCo1 <- format(100*(pcCLR$values$Eigenvalues/sum(pcCLR$values$Eigenvalues))[1], digits=2, nsmall=1)
var.PCo2 <- format(100*(pcCLR$values$Eigenvalues/sum(pcCLR$values$Eigenvalues))[2], digits=2, nsmall=1)

#plot pcoa
clrEu <- pcCLR$vectors %>%
  as.data.frame() %>%
  rownames_to_column("SampleID") %>%
  left_join(metadata %>% rownames_to_column("SampleID")) %>%
  ggplot(aes(x=Axis.1, y=Axis.2, fill=Ethnicity)) +
  geom_point(size=2, shape=21) +
  theme_pcoa() +
  theme(legend.position = "none") +
  ylab(paste0("PCo2 [",var.PCo2,"%]")) +
  xlab(paste0("PCo1 [",var.PCo1,"%]")) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  ggtitle("CLR Euclidean")

Combine pcoa plots

(WUnif | UWUnif | bc) / (jac | jsd | clrEu)

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/figures_16S_human/pcoa_othermetrics.pdf", height=5, width=7, useDingbats=F)

Alpha Diversity

Plot

alphadiv <- data.frame(
  Shannon = vegan::diversity(Subsample.Table(SVtab), index = "shannon", MARGIN = 2), 
  FaithsPD = picante::pd(t(Subsample.Table(SVtab)), tree, include.root = F)$PD,
  Richness = specnumber(Subsample.Table(SVtab), MARGIN = 2)) %>% #Calc richness on subsampled table
  rownames_to_column("SampleID") %>%
  left_join(metadata %>% rownames_to_column("SampleID")) %>%
  select (SampleID, Ethnicity, BMI, IDEO_BMI_Class, `%BF`,`%VAT_over_FM`, `%VAT_over_weight`,Weight, Shannon, FaithsPD, Richness) %>%
  pivot_longer(cols=Shannon:Richness, names_to="alpha_metric")
## Subsampling feature table to 69340 , currently has  1308  taxa.
## ...sampled to 69340 reads with 1305 taxa
## Subsampling feature table to 69340 , currently has  1308  taxa.
## ...sampled to 69340 reads with 1305 taxa
## Subsampling feature table to 69340 , currently has  1308  taxa.
## ...sampled to 69340 reads with 1305 taxa
## Joining, by = "SampleID"
# plot alpha div metrics
plot_1b <- alphadiv %>%
  ggplot(aes(x=Ethnicity, y=value, fill=Ethnicity)) + 
  geom_boxplot(outlier.shape=NA) +
  geom_jitter(shape=21, size=1, height=0, width=0.1) +
  facet_wrap(~alpha_metric, scales="free", nrow=1) +
  theme_boxplot() +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  ylab("Alpha diversity") +
  stat_compare_means(label="p.format", method="wilcox.test", paired = FALSE)
plot_1b

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/figures_16S_human/alpha_div.pdf", height=2.5, width=5, useDingbats=F)

# mann whitney test 
shannon <- alphadiv %>% filter(alpha_metric=="Shannon")
wilcox.test(value~Ethnicity, data = shannon, alternative = "two.sided", paired = FALSE)
## 
##  Wilcoxon rank sum test
## 
## data:  value by Ethnicity
## W = 117, p-value = 0.000926
## alternative hypothesis: true location shift is not equal to 0
richness <- alphadiv %>% filter(alpha_metric=="Richness")
wilcox.test(value~Ethnicity, data = richness, alternative = "two.sided", paired = FALSE)
## Warning in wilcox.test.default(x = c(271, 147, 196, 181, 268, 300, 281, : cannot
## compute exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  value by Ethnicity
## W = 135, p-value = 0.004712
## alternative hypothesis: true location shift is not equal to 0
faiths <- alphadiv %>% filter(alpha_metric=="FaithsPD")
wilcox.test(value~Ethnicity, data = faiths, alternative = "two.sided", paired = FALSE)
## 
##  Wilcoxon rank sum test
## 
## data:  value by Ethnicity
## W = 128, p-value = 0.002328
## alternative hypothesis: true location shift is not equal to 0
# source data
sourcedat1b <- 
  alphadiv %>% 
  select(SampleID, Ethnicity, alpha_metric, value) %>% 
  pivot_wider(id_cols=c("SampleID","Ethnicity"), names_from="alpha_metric", values_from="value")
#write_tsv(sourcedat1b, "/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/sourcedata/Fig1B.tsv")

Save alphadiv output for geographic analysis

Phylum Abundances

Summarize.Taxa(SVtab, lookup)$Phylum %>%  
  Make.CLR() %>%
  as.data.frame() %>%
  rownames_to_column("Feature") %>%
  gather(-Feature, key="SampleID", value="Abundance") %>% 
  left_join(metadata %>% rownames_to_column("SampleID"), by="SampleID") %>%
  select(Feature, SampleID, Abundance, SubjectID, Ethnicity) -> data
## WARNING: CLR being applied with relatively few features.
# wilcox test on phyla abundances
data %>%
  group_by(Feature) %>%
  do(
    broom::glance(wilcox.test(Abundance~Ethnicity, data=., paired=FALSE))
  ) %>%
  ungroup() -> results.phylum
Nice.Table(results.phylum)
# top phyla by mean abundance
data %>%
  filter(!Feature %in% c("Archaea;Euryarchaeota","Bacteria;NA")) %>% 
  group_by(Feature) %>%
  summarize(AvgAbundance=mean(Abundance)) %>%
  arrange(desc(AvgAbundance)) %>% 
  separate(Feature, sep=";", into=c("Kingdom","Phylum")) %>% 
  pull(Phylum) -> order

# plot phyla
plot_1c <- 
  data %>%
  filter(!Feature %in% c("Archaea;Euryarchaeota","Bacteria;NA")) %>% 
  separate(Feature, sep=";", into=c("Kingdom","Phylum")) %>% 
  mutate(Phylum=factor(Phylum, levels=order)) %>%
  ggplot(aes(x=Ethnicity, y=Abundance, fill=Ethnicity)) +
  geom_boxplot(outlier.shape=NA) +  
  geom_jitter(height=0, width=0.1, size=1, shape=21) +
  facet_wrap(~Phylum, nrow=2, scales="free") +
  theme_boxplot() +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  ylab("Abundance (CLR)") +
  stat_compare_means(label="p.format", method="wilcox.test", paired = FALSE)

plot_1c

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/figures_16S_human/16S_human_phyla_all.pdf", height=4, width=7.5, useDingbats=F)

# source data
sourcedat1c <- 
  data %>%
  filter(!Feature %in% c("Archaea;Euryarchaeota","Bacteria;NA")) %>% 
  separate(Feature, sep=";", into=c("Kingdom","Phylum")) %>% 
  mutate(Phylum=factor(Phylum, levels=order)) %>% 
  select(SubjectID, Ethnicity, Phylum, Abundance) %>% 
  pivot_wider(id_cols=c("SubjectID","Ethnicity"),names_from="Phylum",values_from="Abundance")
#write_tsv(sourcedat1c, "/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/sourcedata/Fig1c.tsv")

Stacked bar plots at genus levels

Summarize.Taxa(SVtab, lookup)$Genus %>%
  Make.Percent() -> genera
genera[order(rowMeans(genera), decreasing = T),] -> genera
Main <- genera[1:10,]
Remainder <- colSums(genera[11:243,])
Features <- rbind(Main, Remainder)
rownames(Features) -> order
Features %>% 
  as.data.frame() %>% 
  rownames_to_column("Taxa") %>% 
  separate(Taxa, ";", into = c("1","2","3","4","Family","Genus"), fill="left") %>% 
  pull(Genus) -> order

toplot <-
  Features %>%
  as.data.frame() %>%
  rownames_to_column("Taxa") %>%
  gather(-Taxa, key="Sample", value="Abundance") %>%
  inner_join(metadata %>% rownames_to_column("Sample") %>% select(Sample, Ethnicity), by="Sample") %>% 
  group_by(Taxa, Ethnicity) %>% 
  summarize(AvgAbundance = mean(Abundance)) %>% 
  ungroup() %>% 
  separate(Taxa, ";", into = c("1","2","3","4","Family","Genus"), fill="left") %>%
  mutate(Genus=factor(Genus, levels=rev(order))) 

plot_1d <- 
  toplot %>%
  ggplot(aes(x=Ethnicity, y=AvgAbundance, fill=Genus)) +
  geom_bar(stat="identity") +
  theme_classic() +
  ylab("Average relative abundance (%)") +
  theme(legend.text = element_text(size=6), axis.title.x=element_blank(), axis.title.y= element_text(size=10), axis.text.x=element_text(size=10), axis.text.y=element_text(size=8))
plot_1d

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/figures_16S_human/16S_stacked_bar_genera.pdf", height=2.5, width=3.5, useDingbats=F)

# source data
sourcedat1d <- toplot %>% select(Ethnicity, Genus, AvgAbundance)
#write_tsv(sourcedat1d, "/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/sourcedata/Fig1d.tsv")

Stacked bar plots for individuals

# Phylum level
Summarize.Taxa(SVtab, lookup)$Phylum %>%
  Make.Percent() -> phyla
phyla[order(rowMeans(phyla), decreasing = T),] -> phyla
Main <- phyla[1:6,]
Remainder <- colSums(phyla[7:12,])
Features <- rbind(Main, Remainder)
rownames(Features) -> order

Features %>%
  as.data.frame() %>%
  rownames_to_column("Taxa") %>%
  gather(-Taxa, key="Sample", value="Abundance") %>%
  inner_join(metadata %>% rownames_to_column("Sample") %>% select(Sample, SubjectID, Ethnicity), by="Sample") %>% 
  mutate(Taxa=factor(Taxa, levels=rev(order))) %>%
  ggplot(aes(x=SubjectID,y=Abundance, fill=Taxa)) +
  geom_bar(stat="identity") +
  theme_classic() +
  ylab("Relative abundance (%)") +
  theme(legend.text = element_text(size=6), axis.title.x=element_blank(), axis.title.y= element_text(size=10), axis.text.x=element_text(size=8, angle=45, hjust=1), axis.text.y=element_text(size=8)) +
  facet_grid(~Ethnicity, scales="free") +
  scale_y_continuous(expand = c(0,0))
#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/figures_16S_human/16S_stacked_bar_phyla_individual.pdf", height=3.5, width=8, useDingbats=F)

# Genus level
Summarize.Taxa(SVtab, lookup)$Genus %>%
  Make.Percent() -> genera
genera[order(rowMeans(genera), decreasing = T),] -> genera
Main <- genera[1:10,]
Remainder <- colSums(genera[11:243,])
Features <- rbind(Main, Remainder)
rownames(Features) -> order
Features %>% 
  as.data.frame() %>% 
  rownames_to_column("Taxa") %>% 
  separate(Taxa, ";", into = c("1","2","3","4","Family","Genus"), fill="left") %>% 
  pull(Genus) -> order

Features %>%
  as.data.frame() %>%
  rownames_to_column("Taxa") %>%
  gather(-Taxa, key="Sample", value="Abundance") %>%
  inner_join(metadata %>% rownames_to_column("Sample") %>% select(Sample, SubjectID, Ethnicity), by="Sample") %>% 
  separate(Taxa, ";", into = c("1","2","3","4","Family","Genus"), fill="left") %>%
  mutate(Genus=factor(Genus, levels=rev(order))) %>%
  ggplot(aes(x=SubjectID, y=Abundance, fill=Genus)) +
  geom_bar(stat="identity") +
  theme_classic() +
  ylab("Relative abundance (%)") +
  theme(legend.text = element_text(size=6), axis.title.x=element_blank(), axis.title.y= element_text(size=10), axis.text.x=element_text(size=8, angle=45, hjust=1), axis.text.y=element_text(size=8)) +
  facet_grid(~Ethnicity, scales="free") +
  scale_y_continuous(expand = c(0,0))
#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/figures_16S_human/16S_stacked_bar_genera_individual.pdf", height=3.5, width=8, useDingbats=F)

Differential Abundance on genera: Aldex2

# summarize table to genus level and filter >0.05% reads
genera<-Summarize.Taxa(SVtab, lookup)$Genus
f.genera<-Fraction.Filter(genera,0.0005)
## [1] "Filtering table at a min fraction of 5e-04 of feature table..."
## [1] "...There are 9918566 reads and 243  features"
## [1] "...After filtering there are 9758627 reads and 98 OTUs"
# aldex
results <- aldex(f.genera, metadata$Ethnicity, mc.samples = 128, denom = "all", test = "t", effect = TRUE, include.sample.summary = TRUE)
## aldex.clr: generating Monte-Carlo instances and clr values
## operating in serial mode
## computing center with all features
## aldex.ttest: doing t-test
## aldex.effect: calculating effect sizes
results_genera <-
results %>%
  rownames_to_column("Feature") %>%
  select(Feature, 
         logFC_Between=diff.btw, 
         logFC_Within=diff.win,
         Abundance_EA=rab.win.EA,
         Abundance_W=rab.win.W,
         Pvalue=we.ep, 
         FDR=we.eBH, 
         EffectSize=effect) %>%
  mutate(logFC_EA_vs_W=-(logFC_Between)) %>%
  separate(Feature,sep=";",into=c("K","P","C","O","F","Genus"),remove=F)

# volcano plot
plot_1e <- ggplot(results_genera, aes(x = logFC_EA_vs_W, y=-log10(FDR))) + 
  geom_point() + 
  gghighlight(FDR < 0.1 & abs(logFC_Between)>1, label_key = Genus) +
  theme_bw() +
  xlab("Log2 fold difference (EA/W)") +
  ylab("-Log10(FDR)")
plot_1e

# source data
sourcedat1e <-
  results_genera %>% 
  select("K":"F", Genus, Pvalue, FDR, logFC_EA_vs_W)
#write_tsv(sourcedat1e, "/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/sourcedata/Fig1e.tsv")

Differential Abundance on ASVs: Aldex2

# filter for features at least 0.05% reads
f.SVtab<-Fraction.Filter(SVtab,0.0005)
## [1] "Filtering table at a min fraction of 5e-04 of feature table..."
## [1] "...There are 9918566 reads and 1308  features"
## [1] "...After filtering there are 9090359 reads and 227 OTUs"
# save this matrix for VU to make circular tree figure (run only once)
 # f.SVtab %>% as.data.frame() %>% rownames_to_column("ASV") -> tosave
 # write_csv(tosave,"/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/ASV_matrix_for_aldex_tree.csv")

# aldex
results <- aldex(f.SVtab, metadata$Ethnicity, mc.samples = 128, denom = "all", test = "t", effect = TRUE, include.sample.summary = TRUE)
## aldex.clr: generating Monte-Carlo instances and clr values
## operating in serial mode
## computing center with all features
## aldex.ttest: doing t-test
## aldex.effect: calculating effect sizes
results_SVs <-
results %>%
  rownames_to_column("hash") %>%
  select(hash, 
         logFC_Between=diff.btw, 
         logFC_Within=diff.win,
         Abundance_EA=rab.win.EA,
         Abundance_W=rab.win.W,
         Pvalue=we.ep, 
         FDR=we.eBH, 
         EffectSize=effect) %>%
  left_join(lookup %>% rownames_to_column("hash")) %>%
  mutate(logFC_EA_vs_W=-(logFC_Between)) %>%
  arrange(FDR)
## Joining, by = "hash"
# significant results
sigres_SVs <-
results_SVs %>%
  filter(FDR<0.1 & abs(logFC_Between)>1)
Nice.Table(sigres_SVs)
# volcano plot
plot_1f <- ggplot(results_SVs, aes(x = logFC_EA_vs_W, y=-log10(FDR))) + 
  geom_point() + 
  gghighlight(FDR < 0.1 & abs(logFC_Between)>1, label_key = Genus) +
  theme_bw() +
  xlab("Log2 fold difference (EA/W)") +
  ylab("-Log10(FDR)") +
  xlim(-7,6)
plot_1f

# source data
sourcedat1f <-
  results_SVs %>% 
  select(SV, Kingdom, Phylum, Class, Order, Family, Genus, Species, Species_ambiguous, Pvalue, FDR, logFC_EA_vs_W) %>% 
  rename(ASV = SV)
#write_tsv(sourcedat1f, "/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/sourcedata/Fig1f.tsv")

Correlation bw alpha div and metabolic parameters

# correlate FaithsPD to BMI
  ## white: correlation
bmi_white <- alphadiv %>% filter(alpha_metric=="FaithsPD" & Ethnicity=="W") 
cor.test(~value+BMI, data=bmi_white, method="spearman",conf.level=0.95)
## Warning in cor.test.default(x = c(15.473810632, 12.618487713, 16.065054884, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and BMI
## S = 3844.3, p-value = 0.0003277
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.6714503
  ## chinese: correlations
bmi_chinese <- alphadiv %>% filter(alpha_metric=="FaithsPD" & Ethnicity=="EA") 
cor.test(~value+BMI, data=bmi_chinese, method="spearman",conf.level=0.95)
## Warning in cor.test.default(x = c(17.433724333, 10.718564998, 13.875079969, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and BMI
## S = 2269.3, p-value = 0.2046
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## -0.281356
# plot FaithsPD to BMI
faith_bmi <- alphadiv %>%
  filter(alpha_metric=="FaithsPD") %>%
  ggplot(aes(x=as.numeric(BMI), y=value, fill=Ethnicity)) +
  stat_smooth(method="lm", color="black", size=1) +
  geom_point(size=2, shape=21) +
  facet_wrap(~Ethnicity) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  theme_pcoa() +
  theme(legend.position = "none") +
  xlab("BMI") +
  ylab("Faith's PD")

# correlate FaithsPD to %BF
  ## white: correlation
vat_white <- alphadiv %>% filter(alpha_metric=="FaithsPD" & Ethnicity=="W")
cor.test(~value+as.numeric(`%BF`), data=vat_white, method="spearman",conf.level=0.95)
## Warning in eval(predvars, data, env): NAs introduced by coercion
## Warning in cor.test.default(x = c(15.473810632, 12.618487713, 15.3622618, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and as.numeric(`%BF`)
## S = 3353.3, p-value = 0.0006634
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.6567828
 ## chinese: correlation
vat_chinese <- alphadiv %>% filter(alpha_metric=="FaithsPD" & Ethnicity=="EA") 
cor.test(~value+as.numeric(`%BF`), data=vat_chinese, method="spearman",conf.level=0.95)
## Warning in cor.test.default(x = c(17.433724333, 10.718564998, 13.875079969, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and as.numeric(`%BF`)
## S = 1454.9, p-value = 0.4268
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1784807
# plot FaithsPD to %BF
faith_vat <- alphadiv %>%
  filter(alpha_metric=="FaithsPD") %>%
  ggplot(aes(x=as.numeric(`%BF`), y=value, fill=Ethnicity)) +
  stat_smooth(method="lm", color="black", size=1) +
  geom_point(size=2, shape=21) +
  facet_wrap(~Ethnicity) +
  theme(legend.position = "none") +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  theme_pcoa() +
  theme(legend.position = "none") +
  xlab("Body fat, %") +
  ylab("Faith's PD")

# correlate Shannon to BMI
  ## white: correlation
bmi_white <- alphadiv %>% filter(alpha_metric=="Shannon" & Ethnicity=="W") 
cor.test(~value+BMI, data=bmi_white, method="spearman",conf.level=0.95)
## Warning in cor.test.default(x = c(3.47753869036769, 4.01093947025265,
## 3.93037963333439, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and BMI
## S = 3878.3, p-value = 0.0002136
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.6862362
  ## chinese: correlations
bmi_chinese <- alphadiv %>% filter(alpha_metric=="Shannon" & Ethnicity=="EA") 
cor.test(~value+BMI, data=bmi_chinese, method="spearman",conf.level=0.95)
## Warning in cor.test.default(x = c(3.75391221497536, 3.28392747785931,
## 3.65868347495839, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and BMI
## S = 2732.5, p-value = 0.009026
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.5429379
# plot Shannon to BMI
shannon_bmi <- alphadiv %>%
  filter(alpha_metric=="Shannon") %>%
  ggplot(aes(x=as.numeric(BMI), y=value, fill=Ethnicity)) +
  stat_smooth(method="lm", color="black", size=1) +
  geom_point(size=2, shape=21) +
  facet_wrap(~Ethnicity) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  theme_pcoa() +
  theme(legend.position = "none") +
  xlab("BMI") +
  ylab("Shannon diversity")

# correlate Shannon to %BF
  ## white: correlation
vat_white <- alphadiv %>% filter(alpha_metric=="Shannon" & Ethnicity=="W")
cor.test(~value+as.numeric(`%BF`), data=vat_white, method="spearman",conf.level=0.95)
## Warning in eval(predvars, data, env): NAs introduced by coercion
## Warning in cor.test.default(x = c(3.47753869036769, 4.01093947025265,
## 3.77358789674242, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and as.numeric(`%BF`)
## S = 3276.3, p-value = 0.001647
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.6187299
 ## chinese: correlation
vat_chinese <- alphadiv %>% filter(alpha_metric=="Shannon" & Ethnicity=="EA") 
cor.test(~value+as.numeric(`%BF`), data=vat_chinese, method="spearman",conf.level=0.95)
## Warning in cor.test.default(x = c(3.75391221497536, 3.28392747785931,
## 3.65868347495839, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and as.numeric(`%BF`)
## S = 1893, p-value = 0.7606
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.06890709
# plot Shannon to %BF
shannon_vat <- alphadiv %>%
  filter(alpha_metric=="Shannon") %>%
  ggplot(aes(x=as.numeric(`%BF`), y=value, fill=Ethnicity)) +
  stat_smooth(method="lm", color="black", size=1) +
  geom_point(size=2, shape=21) +
  facet_wrap(~Ethnicity) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  theme_pcoa() +    
  theme(legend.position = "none") +
  xlab("Body fat, %") +
  ylab("Shannon diversity")

# correlate Richness to BMI
  ## white: correlation
bmi_white <- alphadiv %>% filter(alpha_metric=="Richness" & Ethnicity=="W") 
cor.test(~value+BMI, data=bmi_white, method="spearman",conf.level=0.95)
## Warning in cor.test.default(x = c(237, 205, 237, 227, 216, 291, 151, 225, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and BMI
## S = 3804, p-value = 0.0005289
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.6539047
  ## chinese: correlations
bmi_chinese <- alphadiv %>% filter(alpha_metric=="Richness" & Ethnicity=="EA") 
cor.test(~value+BMI, data=bmi_chinese, method="spearman",conf.level=0.95)
## Warning in cor.test.default(x = c(271, 147, 196, 181, 268, 300, 281, 160, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and BMI
## S = 2318.3, p-value = 0.1617
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.3090396
# plot Richness to BMI
richness_bmi <- alphadiv %>%
  filter(alpha_metric=="Richness") %>%
  ggplot(aes(x=as.numeric(BMI), y=value, fill=Ethnicity)) +
  stat_smooth(method="lm", color="black", size=1) +
  geom_point(size=2, shape=21) +
  facet_wrap(~Ethnicity) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  theme_pcoa() +
  theme(legend.position = "none") +
  xlab("BMI") +
  ylab("Richness")

# correlate Richness to %BF
  ## white: correlation
vat_white <- alphadiv %>% filter(alpha_metric=="Richness" & Ethnicity=="W")
cor.test(~value+as.numeric(`%BF`), data=vat_white, method="spearman",conf.level=0.95)
## Warning in eval(predvars, data, env): NAs introduced by coercion
## Warning in cor.test.default(x = c(237, 205, 227, 216, 291, 151, 225, 203, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and as.numeric(`%BF`)
## S = 3252.6, p-value = 0.00213
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.6070193
 ## chinese: correlation
vat_chinese <- alphadiv %>% filter(alpha_metric=="Richness" & Ethnicity=="EA") 
cor.test(~value+as.numeric(`%BF`), data=vat_chinese, method="spearman",conf.level=0.95)
## Warning in cor.test.default(x = c(271, 147, 196, 181, 268, 300, 281, 160, :
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  value and as.numeric(`%BF`)
## S = 1708, p-value = 0.8751
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.03558317
# plot Richness to %BF
richness_vat <- alphadiv %>%
  filter(alpha_metric=="Richness") %>%
  ggplot(aes(x=as.numeric(`%BF`), y=value, fill=Ethnicity)) +
  stat_smooth(method="lm", color="black", size=1) +
  geom_point(size=2, shape=21) +
  facet_wrap(~Ethnicity) +
  scale_fill_manual(values = c(EA=Chinesecolor, W=Whitecolor)) +
  theme_pcoa() +
  theme(legend.position = "none") +
  xlab("Body fat, %") +
  ylab("Richness")

# Combine panels
(faith_bmi | faith_vat) / (richness_bmi | richness_vat) / (shannon_bmi | shannon_vat) + plot_annotation(tag_levels = "A")
## `geom_smooth()` using formula 'y ~ x'
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/figures_16S_human/suppl_fig4_alphadiv_metabolic_corr.pdf", height=6, width=6, useDingbats=F)